# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the semicolon-separated file 'bank-additional-full.csv' into a DataFrame.
data = pd.read_csv('bank-additional-full.csv', sep=';')
data.head() # Preview the first 5 rows
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 1 | 57 | services | married | high.school | unknown | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 2 | 37 | services | married | high.school | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 3 | 40 | admin. | married | basic.6y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 4 | 56 | services | married | high.school | no | no | yes | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
5 rows × 21 columns
data.tail() # Preview the last 5 rows
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 41183 | 73 | retired | married | professional.course | no | yes | no | cellular | nov | fri | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | yes |
| 41184 | 46 | blue-collar | married | professional.course | no | no | no | cellular | nov | fri | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | no |
| 41185 | 56 | retired | married | university.degree | no | yes | no | cellular | nov | fri | ... | 2 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | no |
| 41186 | 44 | technician | married | professional.course | no | no | no | cellular | nov | fri | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | yes |
| 41187 | 74 | retired | married | professional.course | no | yes | no | cellular | nov | fri | ... | 3 | 999 | 1 | failure | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | no |
5 rows × 21 columns
data.columns
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
dtype='object')
# Encode the target column 'y' as integers: 'yes' -> 1, 'no' -> 0.
target_codes = {'yes': 1, 'no': 0}
data['y'] = data['y'].replace(target_codes).astype('int64')
data
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 1 | 57 | services | married | high.school | unknown | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 2 | 37 | services | married | high.school | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 3 | 40 | admin. | married | basic.6y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 4 | 56 | services | married | high.school | no | no | yes | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41183 | 73 | retired | married | professional.course | no | yes | no | cellular | nov | fri | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 1 |
| 41184 | 46 | blue-collar | married | professional.course | no | no | no | cellular | nov | fri | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 0 |
| 41185 | 56 | retired | married | university.degree | no | yes | no | cellular | nov | fri | ... | 2 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 0 |
| 41186 | 44 | technician | married | professional.course | no | no | no | cellular | nov | fri | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 1 |
| 41187 | 74 | retired | married | professional.course | no | yes | no | cellular | nov | fri | ... | 3 | 999 | 1 | failure | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 0 |
41188 rows × 21 columns
# Replace the dots in the affected column names with underscores.
dotted_to_underscored = {
    'emp.var.rate': 'emp_var_rate',
    'cons.price.idx': 'cons_price_idx',
    'cons.conf.idx': 'cons_conf_idx',
    'nr.employed': 'nr_employed',
}
data.rename(columns=dotted_to_underscored, inplace=True)
data.head()
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 1 | 57 | services | married | high.school | unknown | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 2 | 37 | services | married | high.school | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 3 | 40 | admin. | married | basic.6y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 4 | 56 | services | married | high.school | no | no | yes | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
5 rows × 21 columns
data.describe() # Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
| age | duration | campaign | pdays | previous | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 41188.00000 | 41188.000000 | 41188.000000 | 41188.000000 | 41188.000000 | 41188.000000 | 41188.000000 | 41188.000000 | 41188.000000 | 41188.000000 | 41188.000000 |
| mean | 40.02406 | 258.285010 | 2.567593 | 962.475454 | 0.172963 | 0.081886 | 93.575664 | -40.502600 | 3.621291 | 5167.035911 | 0.112654 |
| std | 10.42125 | 259.279249 | 2.770014 | 186.910907 | 0.494901 | 1.570960 | 0.578840 | 4.628198 | 1.734447 | 72.251528 | 0.316173 |
| min | 17.00000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | -3.400000 | 92.201000 | -50.800000 | 0.634000 | 4963.600000 | 0.000000 |
| 25% | 32.00000 | 102.000000 | 1.000000 | 999.000000 | 0.000000 | -1.800000 | 93.075000 | -42.700000 | 1.344000 | 5099.100000 | 0.000000 |
| 50% | 38.00000 | 180.000000 | 2.000000 | 999.000000 | 0.000000 | 1.100000 | 93.749000 | -41.800000 | 4.857000 | 5191.000000 | 0.000000 |
| 75% | 47.00000 | 319.000000 | 3.000000 | 999.000000 | 0.000000 | 1.400000 | 93.994000 | -36.400000 | 4.961000 | 5228.100000 | 0.000000 |
| max | 98.00000 | 4918.000000 | 56.000000 | 999.000000 | 7.000000 | 1.400000 | 94.767000 | -26.900000 | 5.045000 | 5228.100000 | 1.000000 |
data.describe(include='O') # Summary (count, unique, top, freq) of the object-dtype (categorical) columns
| job | marital | education | default | housing | loan | contact | month | day_of_week | poutcome | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 41188 | 41188 | 41188 | 41188 | 41188 | 41188 | 41188 | 41188 | 41188 | 41188 |
| unique | 12 | 4 | 8 | 3 | 3 | 3 | 2 | 10 | 5 | 3 |
| top | admin. | married | university.degree | no | yes | no | cellular | may | thu | nonexistent |
| freq | 10422 | 24928 | 12168 | 32588 | 21576 | 33950 | 26144 | 13769 | 8623 | 35563 |
# Print column dtypes, the number of entries and the non-null counts.
# info() prints its report and returns None, so the call stands alone; a
# trailing comma would wrap that None in a tuple and echo `(None,)`.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 41188 entries, 0 to 41187 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 41188 non-null int64 1 job 41188 non-null object 2 marital 41188 non-null object 3 education 41188 non-null object 4 default 41188 non-null object 5 housing 41188 non-null object 6 loan 41188 non-null object 7 contact 41188 non-null object 8 month 41188 non-null object 9 day_of_week 41188 non-null object 10 duration 41188 non-null int64 11 campaign 41188 non-null int64 12 pdays 41188 non-null int64 13 previous 41188 non-null int64 14 poutcome 41188 non-null object 15 emp_var_rate 41188 non-null float64 16 cons_price_idx 41188 non-null float64 17 cons_conf_idx 41188 non-null float64 18 euribor3m 41188 non-null float64 19 nr_employed 41188 non-null float64 20 y 41188 non-null int64 dtypes: float64(5), int64(6), object(10) memory usage: 6.6+ MB
(None,)
data.dtypes
age int64 job object marital object education object default object housing object loan object contact object month object day_of_week object duration int64 campaign int64 pdays int64 previous int64 poutcome object emp_var_rate float64 cons_price_idx float64 cons_conf_idx float64 euribor3m float64 nr_employed float64 y int64 dtype: object
data.shape # (rows, columns): 41188 rows and 21 columns
(41188, 21)
data.duplicated().sum() # Number of rows that exactly duplicate an earlier row
12
data.y.value_counts()[0] # Number of clients who have not subscribed to the term deposit (y == 0)
36548
data.y.value_counts()[1] # Number of clients who have subscribed to the term deposit (y == 1)
4640
data.head()
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 1 | 57 | services | married | high.school | unknown | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 2 | 37 | services | married | high.school | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 3 | 40 | admin. | married | basic.6y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 4 | 56 | services | married | high.school | no | no | yes | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
5 rows × 21 columns
# Univariate analysis of age: number of rows for each age value.
plt.figure(figsize=(20, 20))
sns.countplot(data=data, x='age')
plt.show()
data.columns
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx',
'cons_conf_idx', 'euribor3m', 'nr_employed', 'y'],
dtype='object')
# Univariate analysis of the numerical columns: one count plot per column.
import warnings
warnings.filterwarnings('ignore')
data1 = data[['age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate',
              'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y']]
# Only the first 10 columns are plotted; 'y', the 11th, is left out.
plot_columns = list(data1.columns[:10])
# One subplot row per plotted column, so the figure has no empty rows.
plt.figure(figsize=(10, 4 * len(plot_columns)), facecolor="white")
for plotnumber, column in enumerate(plot_columns, start=1):
    plt.subplot(len(plot_columns), 1, plotnumber)
    sns.countplot(x=data1[column])
    plt.xlabel(column, fontsize=20)
plt.tight_layout()
plt.show()
# Univariate analysis of the categorical columns: one count plot per column.
import warnings
warnings.filterwarnings('ignore')
data2 = data[['job', 'marital', 'education', 'default', 'housing', 'loan',
              'contact', 'month', 'day_of_week', 'poutcome']]
plot_columns = list(data2.columns)
# One subplot row per column, so the figure has no empty rows.
plt.figure(figsize=(10, 5 * len(plot_columns)), facecolor='white')
for plotnumber, column in enumerate(plot_columns, start=1):
    plt.subplot(len(plot_columns), 1, plotnumber)
    sns.countplot(x=data2[column])
    plt.xlabel(column, fontsize=20)
plt.tight_layout()
plt.show()
# Univariate analysis of the numerical columns: pie chart of each column's value shares.
data1 = data[['age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate',
              'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y']]
# Only the first 10 columns are plotted; 'y', the 11th, is left out.
plot_columns = list(data1.columns[:10])
# One subplot row per pie; the figure height scales with the number of pies
# instead of using a fixed, very large canvas.
plt.figure(figsize=(10, 10 * len(plot_columns)))
for plotnumber, column in enumerate(plot_columns, start=1):
    ax = plt.subplot(len(plot_columns), 1, plotnumber)
    # Compute the shares once and reuse their index as the wedge labels, so
    # the labels always line up with the wedges.
    shares = data1[column].value_counts(normalize=True)
    ax.pie(shares, labels=shares.index, autopct='%1.1f%%')
    ax.set_title(column, fontsize=25)
plt.tight_layout()
plt.show()
# Univariate analysis of the categorical columns: pie chart of each column's category shares.
data2 = data[['job', 'marital', 'education', 'default', 'housing', 'loan',
              'contact', 'month', 'day_of_week', 'poutcome']]
plot_columns = list(data2.columns)
# One subplot row per pie; the figure height scales with the number of pies
# instead of using a fixed, very large canvas.
plt.figure(figsize=(8, 6 * len(plot_columns)))
for plotnumber, column in enumerate(plot_columns, start=1):
    ax = plt.subplot(len(plot_columns), 1, plotnumber)
    # Compute the shares once and reuse their index as the wedge labels, so
    # the labels always line up with the wedges.
    shares = data2[column].value_counts(normalize=True)
    ax.pie(shares, labels=shares.index, autopct='%1.1f%%')
    ax.set_title(column, fontsize=20)
plt.tight_layout()
plt.show()
# Numerical columns against the target: histogram with a KDE curve per column, coloured by y.
import warnings
warnings.filterwarnings('ignore')
data1 = data[['age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate',
              'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y']]
plt.figure(figsize=(10, 80))
# Only the first 10 columns get a subplot; 'y' itself is used as the hue.
for plotnumber, column in enumerate(data1.columns[:10], start=1):
    plt.subplot(10, 1, plotnumber)
    sns.histplot(x=data1[column], hue=data.y, kde=True)
    plt.xlabel(column, fontsize=25)
    plt.ylabel('y', fontsize=25)
plt.show()
# Categorical columns against the target: one histogram per column, coloured by y.
data2 = data[['job', 'marital', 'education', 'default', 'housing', 'loan',
              'contact', 'month', 'day_of_week', 'poutcome']]
plt.figure(figsize=(20, 200))
for plotnumber, column in enumerate(data2.columns[:10], start=1):
    plt.subplot(10, 1, plotnumber)
    sns.histplot(x=data2[column], kde=True, hue=data.y)
    plt.xlabel(column, fontsize=20)
    plt.ylabel('y', fontsize=20)
plt.show()
# Distribution of each numerical column: density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat='density') draws the
# same kind of plot with the supported API.
plt.figure(figsize=(10, 100), facecolor='white')
data1 = data[['age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate',
              'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y']]
# Only the first 10 columns are plotted; 'y', the 11th, is left out.
for plotnumber, column in enumerate(data1.columns[:10], start=1):
    plt.subplot(10, 1, plotnumber)
    sns.histplot(x=data1[column], kde=True, stat='density')
    plt.xlabel(column, fontsize=20)
plt.show()
# Outlier check: one box plot per numerical column.
import warnings
warnings.filterwarnings('ignore')
data1 = data[['age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate',
              'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed']]
plt.figure(figsize=(10, 150), facecolor='white')
for plotnumber, column in enumerate(data1.columns, start=1):
    plt.subplot(11, 1, plotnumber)
    sns.boxplot(data1[column])
    plt.xlabel(column, fontsize=20)
plt.show()
!pip install sweetviz
Requirement already satisfied: sweetviz in c:\users\amanp\anaconda3\lib\site-packages (2.1.4) Requirement already satisfied: pandas!=1.0.0,!=1.0.1,!=1.0.2,>=0.25.3 in c:\users\amanp\anaconda3\lib\site-packages (from sweetviz) (1.5.3) Requirement already satisfied: matplotlib>=3.1.3 in c:\users\amanp\anaconda3\lib\site-packages (from sweetviz) (3.7.0) Requirement already satisfied: jinja2>=2.11.1 in c:\users\amanp\anaconda3\lib\site-packages (from sweetviz) (3.1.2) Requirement already satisfied: tqdm>=4.43.0 in c:\users\amanp\anaconda3\lib\site-packages (from sweetviz) (4.64.1) Requirement already satisfied: importlib-resources>=1.2.0 in c:\users\amanp\anaconda3\lib\site-packages (from sweetviz) (5.12.0) Requirement already satisfied: numpy>=1.16.0 in c:\users\amanp\anaconda3\lib\site-packages (from sweetviz) (1.23.5) Requirement already satisfied: scipy>=1.3.2 in c:\users\amanp\anaconda3\lib\site-packages (from sweetviz) (1.10.0) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\amanp\anaconda3\lib\site-packages (from jinja2>=2.11.1->sweetviz) (2.1.1) Requirement already satisfied: pillow>=6.2.0 in c:\users\amanp\anaconda3\lib\site-packages (from matplotlib>=3.1.3->sweetviz) (9.4.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\amanp\anaconda3\lib\site-packages (from matplotlib>=3.1.3->sweetviz) (3.0.9) Requirement already satisfied: packaging>=20.0 in c:\users\amanp\anaconda3\lib\site-packages (from matplotlib>=3.1.3->sweetviz) (22.0) Requirement already satisfied: cycler>=0.10 in c:\users\amanp\anaconda3\lib\site-packages (from matplotlib>=3.1.3->sweetviz) (0.11.0) Requirement already satisfied: contourpy>=1.0.1 in c:\users\amanp\anaconda3\lib\site-packages (from matplotlib>=3.1.3->sweetviz) (1.0.5) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\amanp\anaconda3\lib\site-packages (from matplotlib>=3.1.3->sweetviz) (1.4.4) Requirement already satisfied: python-dateutil>=2.7 in c:\users\amanp\anaconda3\lib\site-packages (from 
matplotlib>=3.1.3->sweetviz) (2.8.2) Requirement already satisfied: fonttools>=4.22.0 in c:\users\amanp\anaconda3\lib\site-packages (from matplotlib>=3.1.3->sweetviz) (4.25.0) Requirement already satisfied: pytz>=2020.1 in c:\users\amanp\anaconda3\lib\site-packages (from pandas!=1.0.0,!=1.0.1,!=1.0.2,>=0.25.3->sweetviz) (2022.7) Requirement already satisfied: colorama in c:\users\amanp\anaconda3\lib\site-packages (from tqdm>=4.43.0->sweetviz) (0.4.6) Requirement already satisfied: six>=1.5 in c:\users\amanp\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib>=3.1.3->sweetviz) (1.16.0)
# Build a sweetviz report for the whole DataFrame and render it as HTML.
import sweetviz as sv
my_report=sv.analyze(data)
my_report.show_html() # the logged output reports SWEETVIZ_REPORT.html as the generated file
| | [ 0%] 00:00 -> (? left)
Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
data
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 1 | 57 | services | married | high.school | unknown | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 2 | 37 | services | married | high.school | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 3 | 40 | admin. | married | basic.6y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 4 | 56 | services | married | high.school | no | no | yes | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41183 | 73 | retired | married | professional.course | no | yes | no | cellular | nov | fri | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 1 |
| 41184 | 46 | blue-collar | married | professional.course | no | no | no | cellular | nov | fri | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 0 |
| 41185 | 56 | retired | married | university.degree | no | yes | no | cellular | nov | fri | ... | 2 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 0 |
| 41186 | 44 | technician | married | professional.course | no | no | no | cellular | nov | fri | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 1 |
| 41187 | 74 | retired | married | professional.course | no | yes | no | cellular | nov | fri | ... | 3 | 999 | 1 | failure | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 0 |
41188 rows × 21 columns
# Bivariate analysis: count plot of each numerical column, split by the target y.
import warnings
warnings.filterwarnings('ignore')
data1 = data[['age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate',
              'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed']]
# Two plots per row; the row count is derived from the number of columns so
# the grid has no unused rows.
n_cols = 2
n_rows = (len(data1.columns) + n_cols - 1) // n_cols
plt.figure(figsize=(20, 5 * n_rows), facecolor='white')
for plotnumber, column in enumerate(data1.columns, start=1):
    plt.subplot(n_rows, n_cols, plotnumber)
    sns.countplot(x=data1[column], hue=data['y'])
    plt.xlabel(column, fontsize=20)
plt.show()
# Pairwise plots of the numeric columns, coloured by the target y.
# `height` is the current name of the per-facet size argument; the older
# `size` keyword is deprecated in seaborn.
sns.pairplot(data, hue='y', palette='crest', height=3)
<seaborn.axisgrid.PairGrid at 0x1a5df828850>
# Checking correlation
# `data` still contains object (string) columns, so the correlation is
# restricted to the numeric ones explicitly; without numeric_only=True
# recent pandas raises instead of silently dropping them.
plt.figure(figsize=(10, 20))
sns.heatmap(data.corr(numeric_only=True), annot=True, linewidth=0.03)
<Axes: >
# Correlation matrix of the numeric columns (object columns are excluded explicitly).
data.corr(numeric_only=True)
| age | duration | campaign | pdays | previous | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 1.000000 | -0.000866 | 0.004594 | -0.034369 | 0.024365 | -0.000371 | 0.000857 | 0.129372 | 0.010767 | -0.017725 | 0.030399 |
| duration | -0.000866 | 1.000000 | -0.071699 | -0.047577 | 0.020640 | -0.027968 | 0.005312 | -0.008173 | -0.032897 | -0.044703 | 0.405274 |
| campaign | 0.004594 | -0.071699 | 1.000000 | 0.052584 | -0.079141 | 0.150754 | 0.127836 | -0.013733 | 0.135133 | 0.144095 | -0.066357 |
| pdays | -0.034369 | -0.047577 | 0.052584 | 1.000000 | -0.587514 | 0.271004 | 0.078889 | -0.091342 | 0.296899 | 0.372605 | -0.324914 |
| previous | 0.024365 | 0.020640 | -0.079141 | -0.587514 | 1.000000 | -0.420489 | -0.203130 | -0.050936 | -0.454494 | -0.501333 | 0.230181 |
| emp_var_rate | -0.000371 | -0.027968 | 0.150754 | 0.271004 | -0.420489 | 1.000000 | 0.775334 | 0.196041 | 0.972245 | 0.906970 | -0.298334 |
| cons_price_idx | 0.000857 | 0.005312 | 0.127836 | 0.078889 | -0.203130 | 0.775334 | 1.000000 | 0.058986 | 0.688230 | 0.522034 | -0.136211 |
| cons_conf_idx | 0.129372 | -0.008173 | -0.013733 | -0.091342 | -0.050936 | 0.196041 | 0.058986 | 1.000000 | 0.277686 | 0.100513 | 0.054878 |
| euribor3m | 0.010767 | -0.032897 | 0.135133 | 0.296899 | -0.454494 | 0.972245 | 0.688230 | 0.277686 | 1.000000 | 0.945154 | -0.307771 |
| nr_employed | -0.017725 | -0.044703 | 0.144095 | 0.372605 | -0.501333 | 0.906970 | 0.522034 | 0.100513 | 0.945154 | 1.000000 | -0.354678 |
| y | 0.030399 | 0.405274 | -0.066357 | -0.324914 | 0.230181 | -0.298334 | -0.136211 | 0.054878 | -0.307771 | -0.354678 | 1.000000 |
data.isnull().sum()
age 0 job 0 marital 0 education 0 default 0 housing 0 loan 0 contact 0 month 0 day_of_week 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 emp_var_rate 0 cons_price_idx 0 cons_conf_idx 0 euribor3m 0 nr_employed 0 y 0 dtype: int64
# Box plot of age to check for outliers
sns.boxplot(data['age'])
<Axes: >
# First and third quartiles of age.
Q1, Q3 = data['age'].quantile([0.25, 0.75])
print('Q1 is: ', Q1)
print('Q3 is: ', Q3)
Q1 is: 32.0 Q3 is: 47.0
# 1.5*IQR rule: values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR are treated as outliers.
iqr = Q3 - Q1
l_outlier = Q1 - 1.5 * iqr
u_outlier = Q3 + 1.5 * iqr
print('l_outlier is: ', l_outlier)
print('u_outlier is: ', u_outlier)
l_outlier is: 9.5 u_outlier is: 69.5
# Number of age outliers on each side of the IQR fences. The computed fences
# (u_outlier / l_outlier) are used rather than re-typed literals, so the
# counts cannot drift from the bounds when the data changes.
print('number of outliers in age upper: ', data[data['age'] > u_outlier]['age'].count())
print('number of outliers in age lower: ', data[data['age'] < l_outlier]['age'].count())
number of outliers in age upper: 469 number of outliers in age lower: 0
data.loc[data['age']>69.5]
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 27713 | 70 | retired | divorced | basic.4y | no | yes | no | cellular | mar | mon | ... | 3 | 999 | 0 | nonexistent | -1.8 | 92.843 | -50.0 | 1.811 | 5099.1 | 1 |
| 27757 | 76 | retired | married | university.degree | no | no | yes | cellular | mar | thu | ... | 9 | 999 | 0 | nonexistent | -1.8 | 92.843 | -50.0 | 1.757 | 5099.1 | 0 |
| 27780 | 73 | retired | married | university.degree | no | yes | no | cellular | mar | tue | ... | 1 | 999 | 1 | failure | -1.8 | 92.843 | -50.0 | 1.687 | 5099.1 | 0 |
| 27800 | 88 | retired | divorced | basic.4y | no | yes | no | cellular | mar | wed | ... | 1 | 999 | 0 | nonexistent | -1.8 | 92.843 | -50.0 | 1.663 | 5099.1 | 0 |
| 27802 | 88 | retired | divorced | basic.4y | no | no | no | cellular | mar | wed | ... | 2 | 999 | 0 | nonexistent | -1.8 | 92.843 | -50.0 | 1.663 | 5099.1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 40986 | 84 | retired | divorced | basic.4y | unknown | yes | yes | cellular | oct | mon | ... | 4 | 3 | 1 | success | -1.1 | 94.601 | -49.5 | 1.000 | 4963.6 | 0 |
| 40996 | 81 | retired | married | basic.4y | no | yes | no | cellular | oct | wed | ... | 1 | 999 | 2 | failure | -1.1 | 94.601 | -49.5 | 1.016 | 4963.6 | 1 |
| 41004 | 80 | retired | married | professional.course | no | yes | no | cellular | oct | thu | ... | 1 | 999 | 1 | failure | -1.1 | 94.601 | -49.5 | 1.025 | 4963.6 | 1 |
| 41183 | 73 | retired | married | professional.course | no | yes | no | cellular | nov | fri | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 1 |
| 41187 | 74 | retired | married | professional.course | no | yes | no | cellular | nov | fri | ... | 3 | 999 | 1 | failure | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 0 |
469 rows × 21 columns
# Median age among the rows below the upper fence (age < 69.5), shown for reference.
median = data.loc[data['age']<69.5, 'age'].median()
median
38.0
# Replace the upper-side age outliers with the median of the remaining ages.
is_outlier = data['age'] > u_outlier
replacement = data.loc[~is_outlier, 'age'].median()
# Series.mask swaps the flagged values in a single step and the result is
# assigned back to the column. This avoids writing NaN into an integer column
# and calling fillna(inplace=True) on a column selection, both of which recent
# pandas versions deprecate. The explicit float64 cast keeps the column dtype
# the same as the NaN-then-fill approach produced.
data['age'] = data['age'].astype('float64').mask(is_outlier, replacement)
sns.boxplot(data['age'])
<Axes: >
# Box plot of duration to check for outliers
sns.boxplot(data['duration'])
<Axes: >
# First and third quartiles of duration.
Q1, Q3 = data['duration'].quantile([0.25, 0.75])
print('Q1 is: ', Q1)
print('Q3 is: ', Q3)
Q1 is: 102.0 Q3 is: 319.0
# 1.5*IQR rule: values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR are treated as outliers.
iqr = Q3 - Q1
l_outlier = Q1 - 1.5 * iqr
u_outlier = Q3 + 1.5 * iqr
print('l_outlier is: ', l_outlier)
print('u_outlier is: ', u_outlier)
l_outlier is: -223.5 u_outlier is: 644.5
# Number of duration outliers on each side of the IQR fences. The computed
# fences are used rather than re-typed literals, so the counts cannot drift
# from the bounds when the data changes.
print('Number of outlier in duration upper is: ', data[data['duration'] > u_outlier]['duration'].count())
print('number of outlier in duration lower is: ', data[data['duration'] < l_outlier]['duration'].count())
Number of outlier in duration upper is: 2963 number of outlier in duration lower is: 0
data.loc[data['duration']>644.5]
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 37 | 52.0 | technician | married | basic.9y | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 57 | 45.0 | services | married | high.school | unknown | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 61 | 51.0 | blue-collar | married | basic.9y | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 75 | 41.0 | blue-collar | divorced | basic.4y | unknown | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 1 |
| 83 | 49.0 | entrepreneur | married | university.degree | unknown | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41147 | 25.0 | technician | single | professional.course | no | yes | no | cellular | nov | fri | ... | 2 | 19 | 1 | success | -1.1 | 94.767 | -50.8 | 1.040 | 4963.6 | 1 |
| 41153 | 67.0 | housemaid | divorced | professional.course | no | yes | no | cellular | nov | mon | ... | 2 | 5 | 5 | success | -1.1 | 94.767 | -50.8 | 1.039 | 4963.6 | 1 |
| 41160 | 33.0 | admin. | married | university.degree | no | no | no | cellular | nov | tue | ... | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.035 | 4963.6 | 1 |
| 41164 | 54.0 | admin. | married | professional.course | no | no | no | cellular | nov | tue | ... | 2 | 10 | 1 | success | -1.1 | 94.767 | -50.8 | 1.035 | 4963.6 | 1 |
| 41166 | 32.0 | admin. | married | university.degree | no | no | no | telephone | nov | wed | ... | 1 | 999 | 1 | failure | -1.1 | 94.767 | -50.8 | 1.030 | 4963.6 | 1 |
2963 rows × 21 columns
# Median duration among the rows below the upper fence (duration < 644.5), shown for reference.
median = data.loc[data['duration']<644.5, 'duration'].median()
median
167.0
# Replace the upper-side duration outliers with the median of the remaining values.
is_outlier = data['duration'] > u_outlier
replacement = data.loc[~is_outlier, 'duration'].median()
# Series.mask swaps the flagged values in a single step and the result is
# assigned back to the column. This avoids writing NaN into an integer column
# and calling fillna(inplace=True) on a column selection, both of which recent
# pandas versions deprecate. The explicit float64 cast keeps the column dtype
# the same as the NaN-then-fill approach produced.
data['duration'] = data['duration'].astype('float64').mask(is_outlier, replacement)
sns.boxplot(data['duration'])
<Axes: >
# Box plot of campaign to check for outliers
sns.boxplot(data['campaign'])
<Axes: >
# First and third quartiles of campaign.
Q1, Q3 = data['campaign'].quantile([0.25, 0.75])
print('Q1 is: ', Q1)
print('Q3 is: ', Q3)
Q1 is: 1.0 Q3 is: 3.0
# 1.5*IQR rule: the lower fence is Q1 - 1.5*IQR and the upper fence is Q3 + 1.5*IQR.
iqr = Q3 - Q1
l_outlier = Q1 - 1.5 * iqr
u_outlier = Q3 + 1.5 * iqr
print('l_outlier is: ', l_outlier)
print('u_outlier is: ', u_outlier)
l_outlier is: -2.0 u_outlier is: 6.0
# Number of campaign outliers on each side of the IQR fences. The computed
# fences are used rather than re-typed literals, so the counts cannot drift
# from the bounds when the data changes.
print('number of outlier in campaign upper is: ', data[data['campaign'] > u_outlier]['campaign'].count())
print('number of outlier in camapaign lower is: ', data[data['campaign'] < l_outlier]['campaign'].count())
number of outlier in campaign upper is: 2406 number of outlier in camapaign lower is: 0
data.loc[data['campaign']>6.0]
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 887 | 54.0 | admin. | married | university.degree | no | no | no | telephone | may | wed | ... | 7 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.856 | 5191.0 | 0 |
| 1043 | 41.0 | technician | single | university.degree | unknown | no | no | telephone | may | wed | ... | 8 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.856 | 5191.0 | 0 |
| 1094 | 40.0 | technician | married | professional.course | no | yes | no | telephone | may | wed | ... | 8 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.856 | 5191.0 | 0 |
| 1097 | 51.0 | blue-collar | married | basic.4y | unknown | yes | no | telephone | may | wed | ... | 7 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.856 | 5191.0 | 0 |
| 1230 | 48.0 | blue-collar | divorced | basic.4y | no | no | no | telephone | may | thu | ... | 7 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.855 | 5191.0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 40611 | 38.0 | retired | married | basic.9y | no | no | no | telephone | sep | fri | ... | 9 | 999 | 1 | failure | -1.1 | 94.199 | -37.5 | 0.883 | 4963.6 | 0 |
| 40631 | 38.0 | retired | married | basic.4y | no | yes | no | cellular | sep | mon | ... | 7 | 999 | 0 | nonexistent | -1.1 | 94.199 | -37.5 | 0.882 | 4963.6 | 0 |
| 40698 | 29.0 | technician | single | professional.course | no | yes | no | cellular | sep | mon | ... | 7 | 3 | 5 | success | -1.1 | 94.199 | -37.5 | 0.879 | 4963.6 | 0 |
| 40821 | 33.0 | technician | single | professional.course | no | yes | no | cellular | sep | fri | ... | 9 | 999 | 2 | failure | -1.1 | 94.199 | -37.5 | 0.879 | 4963.6 | 0 |
| 40993 | 50.0 | entrepreneur | divorced | university.degree | no | yes | no | telephone | oct | tue | ... | 7 | 11 | 2 | success | -1.1 | 94.601 | -49.5 | 1.008 | 4963.6 | 1 |
2406 rows × 21 columns
# Median of the in-range campaign values. Outliers are defined as > 6.0,
# so the boundary value 6.0 itself is an inlier and must be included.
median = data.loc[data['campaign'] <= 6.0, 'campaign'].median()
median
2.0
# Blank out campaign values above 6.0, then impute them with the median of
# the remaining (non-NaN) values.
data.loc[data['campaign'] > 6.0, 'campaign'] = np.nan
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form does not update `data`
# under pandas Copy-on-Write.
data['campaign'] = data['campaign'].fillna(data['campaign'].median())
sns.boxplot(data['campaign'])
<Axes: >
# Checking pdays: box plot of the column to eyeball its spread before
# computing the IQR fences.
sns.boxplot(data['pdays'])
<Axes: >
# First and third quartiles of pdays; these feed the IQR fence computation.
pdays_col = data['pdays']
Q1 = pdays_col.quantile(0.25)
Q3 = pdays_col.quantile(0.75)
print('Q1 is: ', Q1)
print('Q3 is: ', Q3)
Q1 is: 999.0 Q3 is: 999.0
# IQR = Q3 - Q1
# lower fence = Q1 - 1.5*IQR
# upper fence = Q3 + 1.5*IQR
iqr = Q3 - Q1
# The two names were previously swapped (l_outlier held the upper fence).
l_outlier = Q1 - 1.5*iqr
u_outlier = Q3 + 1.5*iqr
print('l_outlier is: ', l_outlier)
print('u_outlier is: ', u_outlier)
l_outlier is: 999.0 u_outlier is: 999.0
# Count the non-null pdays values on each side of 999.0.
above_999 = data.loc[data['pdays'] > 999.0, 'pdays']
below_999 = data.loc[data['pdays'] < 999.0, 'pdays']
print('number of outlier in pdays upper is: ', above_999.count())
print('number of outlier in pdays lower is: ', below_999.count())
number of outlier in pdays upper is: 0 number of outlier in pdays lower is: 1515
data.loc[data['pdays']<999.0]
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 24108 | 37.0 | admin. | married | university.degree | no | yes | no | telephone | nov | wed | ... | 1.0 | 6 | 1 | success | -0.1 | 93.200 | -42.0 | 4.286 | 5195.8 | 0 |
| 24264 | 48.0 | management | married | university.degree | no | yes | no | cellular | nov | mon | ... | 1.0 | 4 | 1 | success | -0.1 | 93.200 | -42.0 | 4.191 | 5195.8 | 0 |
| 24279 | 42.0 | technician | married | professional.course | unknown | yes | no | cellular | nov | mon | ... | 1.0 | 4 | 1 | success | -0.1 | 93.200 | -42.0 | 4.191 | 5195.8 | 0 |
| 24397 | 50.0 | entrepreneur | married | university.degree | unknown | yes | no | cellular | nov | mon | ... | 1.0 | 3 | 1 | success | -0.1 | 93.200 | -42.0 | 4.191 | 5195.8 | 0 |
| 24482 | 36.0 | self-employed | single | university.degree | no | yes | no | cellular | nov | mon | ... | 2.0 | 4 | 1 | success | -0.1 | 93.200 | -42.0 | 4.191 | 5195.8 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41163 | 35.0 | technician | divorced | basic.4y | no | yes | no | cellular | nov | tue | ... | 3.0 | 4 | 2 | success | -1.1 | 94.767 | -50.8 | 1.035 | 4963.6 | 1 |
| 41164 | 54.0 | admin. | married | professional.course | no | no | no | cellular | nov | tue | ... | 2.0 | 10 | 1 | success | -1.1 | 94.767 | -50.8 | 1.035 | 4963.6 | 1 |
| 41174 | 62.0 | retired | married | university.degree | no | yes | no | cellular | nov | thu | ... | 1.0 | 1 | 6 | success | -1.1 | 94.767 | -50.8 | 1.031 | 4963.6 | 1 |
| 41178 | 62.0 | retired | married | university.degree | no | no | no | cellular | nov | thu | ... | 2.0 | 6 | 3 | success | -1.1 | 94.767 | -50.8 | 1.031 | 4963.6 | 1 |
| 41182 | 29.0 | unemployed | single | basic.4y | no | yes | no | cellular | nov | fri | ... | 1.0 | 9 | 1 | success | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 0 |
1515 rows × 21 columns
# Median of the pdays values that lie below 999.0 (shown for inspection).
below_999_mask = data['pdays'] < 999.0
median = data['pdays'][below_999_mask].median()
median
6.0
# Blank out every pdays value below 999.0 and impute with the median of what
# remains.
# NOTE(review): Q1 and Q3 are both 999.0 and no value exceeds 999.0, so after
# this step the whole column is the constant 999.0 and carries no signal.
# 999 looks like a "not previously contacted" sentinel rather than a real
# measurement - confirm against the dataset description and consider keeping
# the raw values or deriving a "was contacted before" flag instead.
data.loc[data['pdays'] < 999.0, 'pdays'] = np.nan
# Assign back rather than fillna(inplace=True) on the column selection, which
# does not update `data` under pandas Copy-on-Write.
data['pdays'] = data['pdays'].fillna(data['pdays'].median())
sns.boxplot(data['pdays'])
<Axes: >
# Checking outliers in cons_conf_idx: box plot of the column before the IQR
# fences are computed.
sns.boxplot(data['cons_conf_idx'])
<Axes: >
# First and third quartiles of cons_conf_idx for the IQR fences.
conf_idx_col = data['cons_conf_idx']
Q1 = conf_idx_col.quantile(q=0.25)
Q3 = conf_idx_col.quantile(q=0.75)
print('Q1 is: ', Q1)
print('Q3 is: ', Q3)
Q1 is: -42.7 Q3 is: -36.4
# IQR = Q3 - Q1
# lower fence = Q1 - 1.5*IQR
# upper fence = Q3 + 1.5*IQR
iqr = Q3 - Q1
l_outlier, u_outlier = Q1 - 1.5*iqr, Q3 + 1.5*iqr
print('l_outlier is: ', l_outlier)
print('u_outlier is: ', u_outlier)
l_outlier is: -52.150000000000006 u_outlier is: -26.949999999999992
# Count values outside the IQR fences. Use the computed fences rather than
# pasted float literals so the counts stay correct if the data changes.
print('number of lower outlier in cons_conf_idx is: ', data[data['cons_conf_idx'] < l_outlier]['cons_conf_idx'].count())
print('number of upper outlier in cons_conf_idx is: ', data[data['cons_conf_idx'] > u_outlier]['cons_conf_idx'].count())
number of lower outlier in cons_conf_idx is: 0 number of upper outlier in cons_conf_idx is: 447
# Rows whose cons_conf_idx lies above the computed upper fence.
data.loc[data['cons_conf_idx'] > u_outlier]
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 38154 | 50.0 | management | married | university.degree | no | yes | no | cellular | oct | thu | ... | 2.0 | 999.0 | 1 | success | -3.4 | 92.431 | -26.9 | 0.754 | 5017.5 | 1 |
| 38155 | 37.0 | admin. | single | university.degree | no | yes | no | cellular | oct | thu | ... | 3.0 | 999.0 | 0 | nonexistent | -3.4 | 92.431 | -26.9 | 0.754 | 5017.5 | 1 |
| 38156 | 59.0 | technician | single | basic.6y | no | no | no | cellular | oct | thu | ... | 1.0 | 999.0 | 2 | failure | -3.4 | 92.431 | -26.9 | 0.754 | 5017.5 | 0 |
| 38157 | 31.0 | admin. | married | university.degree | no | yes | no | cellular | oct | thu | ... | 1.0 | 999.0 | 0 | nonexistent | -3.4 | 92.431 | -26.9 | 0.754 | 5017.5 | 0 |
| 38158 | 35.0 | admin. | married | high.school | no | yes | no | cellular | oct | thu | ... | 1.0 | 999.0 | 1 | success | -3.4 | 92.431 | -26.9 | 0.754 | 5017.5 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 38596 | 69.0 | retired | married | basic.4y | no | yes | yes | cellular | oct | fri | ... | 3.0 | 999.0 | 0 | nonexistent | -3.4 | 92.431 | -26.9 | 0.720 | 5017.5 | 1 |
| 38597 | 18.0 | student | single | basic.6y | no | no | yes | cellular | oct | fri | ... | 2.0 | 999.0 | 0 | nonexistent | -3.4 | 92.431 | -26.9 | 0.720 | 5017.5 | 1 |
| 38598 | 59.0 | retired | divorced | basic.4y | no | yes | no | telephone | oct | fri | ... | 4.0 | 999.0 | 0 | nonexistent | -3.4 | 92.431 | -26.9 | 0.720 | 5017.5 | 0 |
| 38599 | 37.0 | admin. | single | university.degree | no | no | no | cellular | oct | fri | ... | 2.0 | 999.0 | 0 | nonexistent | -3.4 | 92.431 | -26.9 | 0.720 | 5017.5 | 1 |
| 38600 | 38.0 | retired | divorced | basic.6y | no | no | no | telephone | oct | fri | ... | 2.0 | 999.0 | 0 | nonexistent | -3.4 | 92.431 | -26.9 | 0.720 | 5017.5 | 0 |
447 rows × 21 columns
# Median of the in-range cons_conf_idx values. Outliers are defined as
# > u_outlier, so inliers are <= u_outlier; the computed fence replaces the
# pasted literal.
median = data.loc[data['cons_conf_idx'] <= u_outlier, 'cons_conf_idx'].median()
median
-41.8
# Blank out values above the upper IQR fence and impute them with the median
# of the remaining values.
data.loc[data['cons_conf_idx'] > u_outlier, 'cons_conf_idx'] = np.nan
# Assign back rather than fillna(inplace=True) on the column selection, which
# does not update `data` under pandas Copy-on-Write.
data['cons_conf_idx'] = data['cons_conf_idx'].fillna(data['cons_conf_idx'].median())
sns.boxplot(data['cons_conf_idx'])
<Axes: >
data
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56.0 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | ... | 1.0 | 999.0 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 1 | 57.0 | services | married | high.school | unknown | no | no | telephone | may | mon | ... | 1.0 | 999.0 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 2 | 37.0 | services | married | high.school | no | yes | no | telephone | may | mon | ... | 1.0 | 999.0 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 3 | 40.0 | admin. | married | basic.6y | no | no | no | telephone | may | mon | ... | 1.0 | 999.0 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 4 | 56.0 | services | married | high.school | no | no | yes | telephone | may | mon | ... | 1.0 | 999.0 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41183 | 38.0 | retired | married | professional.course | no | yes | no | cellular | nov | fri | ... | 1.0 | 999.0 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 1 |
| 41184 | 46.0 | blue-collar | married | professional.course | no | no | no | cellular | nov | fri | ... | 1.0 | 999.0 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 0 |
| 41185 | 56.0 | retired | married | university.degree | no | yes | no | cellular | nov | fri | ... | 2.0 | 999.0 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 0 |
| 41186 | 44.0 | technician | married | professional.course | no | no | no | cellular | nov | fri | ... | 1.0 | 999.0 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 1 |
| 41187 | 38.0 | retired | married | professional.course | no | yes | no | cellular | nov | fri | ... | 3.0 | 999.0 | 1 | failure | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 0 |
41188 rows × 21 columns
# Work on a copy so `data` keeps the original category labels.
df = data.copy()

# Integer code for every label of each categorical column. The codes are the
# ones the notebook has always used; values not listed are left untouched.
category_codes = {
    'job': {
        'housemaid': 0, 'services': 1, 'admin.': 2, 'blue-collar': 3,
        'technician': 4, 'retired': 5, 'management': 6, 'unemployed': 7,
        'self-employed': 8, 'unknown': 9, 'entrepreneur': 10, 'student': 11,
    },
    'default': {'no': 7, 'yes': 8, 'unknown': 9},
    'education': {
        'basic.4y': 3, 'basic.6y': 2, 'basic.9y': 0, 'professional.course': 1,
        'unknown': 4, 'high.school': 5, 'university.degree': 6, 'illiterate': 7,
    },
    'marital': {'divorced': 10, 'married': 11, 'single': 12, 'unknown': 13},
    'housing': {'no': 15, 'yes': 16, 'unknown': 17},
    'loan': {'no': 0, 'yes': 2, 'unknown': 4},
    'contact': {'cellular': 7, 'telephone': 9},
    'month': {
        'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'oct': 10, 'nov': 11,
        'dec': 12, 'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'sep': 9,
    },
    'day_of_week': {'mon': 7, 'tue': 6, 'wed': 5, 'thu': 4, 'fri': 3},
    'poutcome': {'failure': 4, 'nonexistent': 2, 'success': 7},
}

# Assign each encoded column back explicitly. The previous
# df[col].replace(..., inplace=True) form acts on a column selection and does
# not update `df` under pandas Copy-on-Write.
for column, codes in category_codes.items():
    df[column] = df[column].replace(codes)
df
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56.0 | 0 | 11 | 3 | 7 | 15 | 0 | 9 | 5 | 7 | ... | 1.0 | 999.0 | 0 | 2 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 1 | 57.0 | 1 | 11 | 5 | 9 | 15 | 0 | 9 | 5 | 7 | ... | 1.0 | 999.0 | 0 | 2 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 2 | 37.0 | 1 | 11 | 5 | 7 | 16 | 0 | 9 | 5 | 7 | ... | 1.0 | 999.0 | 0 | 2 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 3 | 40.0 | 2 | 11 | 2 | 7 | 15 | 0 | 9 | 5 | 7 | ... | 1.0 | 999.0 | 0 | 2 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 4 | 56.0 | 1 | 11 | 5 | 7 | 15 | 2 | 9 | 5 | 7 | ... | 1.0 | 999.0 | 0 | 2 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41183 | 38.0 | 5 | 11 | 1 | 7 | 16 | 0 | 7 | 11 | 3 | ... | 1.0 | 999.0 | 0 | 2 | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 1 |
| 41184 | 46.0 | 3 | 11 | 1 | 7 | 15 | 0 | 7 | 11 | 3 | ... | 1.0 | 999.0 | 0 | 2 | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 0 |
| 41185 | 56.0 | 5 | 11 | 6 | 7 | 16 | 0 | 7 | 11 | 3 | ... | 2.0 | 999.0 | 0 | 2 | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 0 |
| 41186 | 44.0 | 4 | 11 | 1 | 7 | 15 | 0 | 7 | 11 | 3 | ... | 1.0 | 999.0 | 0 | 2 | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 1 |
| 41187 | 38.0 | 5 | 11 | 1 | 7 | 16 | 0 | 7 | 11 | 3 | ... | 3.0 | 999.0 | 1 | 4 | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | 0 |
41188 rows × 21 columns
# Target is the 'y' column; the features are every other column.
Y = df['y']
X = df.drop('y', axis=1)
X
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56.0 | 0 | 11 | 3 | 7 | 15 | 0 | 9 | 5 | 7 | 261.0 | 1.0 | 999.0 | 0 | 2 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 |
| 1 | 57.0 | 1 | 11 | 5 | 9 | 15 | 0 | 9 | 5 | 7 | 149.0 | 1.0 | 999.0 | 0 | 2 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 |
| 2 | 37.0 | 1 | 11 | 5 | 7 | 16 | 0 | 9 | 5 | 7 | 226.0 | 1.0 | 999.0 | 0 | 2 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 |
| 3 | 40.0 | 2 | 11 | 2 | 7 | 15 | 0 | 9 | 5 | 7 | 151.0 | 1.0 | 999.0 | 0 | 2 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 |
| 4 | 56.0 | 1 | 11 | 5 | 7 | 15 | 2 | 9 | 5 | 7 | 307.0 | 1.0 | 999.0 | 0 | 2 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41183 | 38.0 | 5 | 11 | 1 | 7 | 16 | 0 | 7 | 11 | 3 | 334.0 | 1.0 | 999.0 | 0 | 2 | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 |
| 41184 | 46.0 | 3 | 11 | 1 | 7 | 15 | 0 | 7 | 11 | 3 | 383.0 | 1.0 | 999.0 | 0 | 2 | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 |
| 41185 | 56.0 | 5 | 11 | 6 | 7 | 16 | 0 | 7 | 11 | 3 | 189.0 | 2.0 | 999.0 | 0 | 2 | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 |
| 41186 | 44.0 | 4 | 11 | 1 | 7 | 15 | 0 | 7 | 11 | 3 | 442.0 | 1.0 | 999.0 | 0 | 2 | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 |
| 41187 | 38.0 | 5 | 11 | 1 | 7 | 16 | 0 | 7 | 11 | 3 | 239.0 | 3.0 | 999.0 | 1 | 4 | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 |
41188 rows × 20 columns
# Standardise every feature column (author's rationale: the features look
# roughly normally distributed).
from sklearn.preprocessing import StandardScaler
Scaler = StandardScaler()  # scaler instance
X_scaled = Scaler.fit(X).transform(X)
X_scaled
array([[ 1.70658113, -1.49696895, -0.2837415 , ..., 0.9680674 ,
0.71245988, 0.33167991],
[ 1.81051927, -1.08948536, -0.2837415 , ..., 0.9680674 ,
0.71245988, 0.33167991],
[-0.26824358, -1.08948536, -0.2837415 , ..., 0.9680674 ,
0.71245988, 0.33167991],
...,
[ 1.70658113, 0.540449 , -0.2837415 , ..., -2.30096943,
-1.49518647, -2.8156966 ],
[ 0.45932342, 0.13296541, -0.2837415 , ..., -2.30096943,
-1.49518647, -2.8156966 ],
[-0.16430544, 0.540449 , -0.2837415 , ..., -2.30096943,
-1.49518647, -2.8156966 ]])
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* features first, then fit the scaler on the training
# rows only. Fitting it on all rows before splitting lets test-set statistics
# leak into the training data. The split depends only on the row count and
# random_state, so the same rows land in train/test as before.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
Scaler = StandardScaler().fit(X_train_raw)
X_train = Scaler.transform(X_train_raw)
X_test = Scaler.transform(X_test_raw)
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, fitted on the training split.
log_reg = LogisticRegression()
log_reg.fit(X=X_train, y=Y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
# Logistic-regression predictions for the rows it was trained on
Y_train_pred = log_reg.predict(X=X_train)
Y_train_pred
array([0, 0, 0, ..., 1, 0, 0], dtype=int64)
# Logistic-regression predictions for the held-out test rows
Y_pred = log_reg.predict(X=X_test)
Y_pred
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Check the shape of the training labels
Y_train.shape
(30891,)
Y_pred.shape
(10297,)
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)
# Accuracy of the logistic regression on the training split (Y_train vs. its
# own training predictions), not on the held-out test set.
accuracy = accuracy_score(y_true=Y_train, y_pred=Y_train_pred)
accuracy
0.9033051697905539
from imblearn.over_sampling import SMOTE
# Oversample the minority class of the training split only.
# NOTE(review): SMOTE() is built without a random_state, so the synthetic
# rows presumably differ between runs - consider fixing a seed.
smote = SMOTE()  # object creation
X_train_smote, Y_train_smote = smote.fit_resample(X_train.astype('float'), Y_train)
from collections import Counter
# Class counts before and after resampling. The second label previously also
# read 'Actual classes', which mislabelled the resampled counts.
print('Actual classes', Counter(Y_train))
print('SMOTE classes', Counter(Y_train_smote))
Actual classes Counter({0: 27404, 1: 3487})
Actual classes Counter({0: 27404, 1: 27404})
from sklearn.neighbors import KNeighborsClassifier
# k-NN with k=7, fitted on the SMOTE-resampled training data.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X=X_train_smote, y=Y_train_smote)
KNeighborsClassifier(n_neighbors=7)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier(n_neighbors=7)
Y_pred = knn.predict(X_test)

# Misclassification rate on the test split for k = 1..10.
# The loop body must be indented under the `for`; as previously written, at
# column 0, the cell could not run.
# NOTE(review): k is being compared against X_test/Y_test, i.e. tuned on the
# same rows later used to report accuracy.
k_values = range(1, 11)
error_rate = []
for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train_smote, Y_train_smote)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != Y_test))

# Error rate as a function of k.
plt.figure(figsize=(10,6))
plt.plot(k_values, error_rate, color='green', linestyle='dashed', marker='s', markerfacecolor='purple', markersize=10)
plt.title('Error Rate vs. K value')
plt.xlabel('k')
plt.ylabel('Error Rate')
Text(0, 0.5, 'Error Rate')
# Refit k-NN with k=5 on the SMOTE-resampled data and score it on the test split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_smote, Y_train_smote)
Y_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(Y_test, Y_pred)
print("The accuracy score is : ", knn_accuracy)
The accuracy score is : 0.7686704865494804
from sklearn.svm import SVC
# Support-vector classifier with default settings, fitted on the original
# (non-resampled) training split.
svm = SVC()
svm.fit(X=X_train, y=Y_train)
SVC()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC()
# Test-split predictions and accuracy for the SVC.
Y_pred_sv = svm.predict(X=X_test)
svm_accuracy = accuracy_score(Y_test, Y_pred_sv)
print(svm_accuracy)
0.9052151111974361
from sklearn.ensemble import RandomForestClassifier
# Random forest with default settings, fitted on the training split.
R_forest = RandomForestClassifier()
R_forest.fit(X=X_train, y=Y_train)
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier()
# Random-forest predictions for the test split
Y_pred = R_forest.predict(X=X_test)
Y_pred
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
(Y_test == Y_pred).sum()/len(Y_test)*100
91.19160920656502
print('accuracy_score is: ', accuracy_score(Y_test,Y_pred))
accuracy_score is: 0.9119160920656502
from sklearn.ensemble import GradientBoostingClassifier
# scikit-learn gradient boosting with default settings (the variable is named
# XGB, but this is sklearn's GradientBoostingClassifier).
XGB = GradientBoostingClassifier()
XGB.fit(X=X_train, y=Y_train)
GradientBoostingClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GradientBoostingClassifier()
# Gradient-boosting predictions for the test split
Y_pred = XGB.predict(X=X_test)
Y_pred
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
print('Accuracy_score is: ', accuracy_score(Y_test, Y_pred))
Accuracy_score is: 0.9119160920656502
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default settings, fitted on the training split.
d_tree = DecisionTreeClassifier()
d_tree.fit(X=X_train, y=Y_train)
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier()
# Decision-tree predictions for the test split
Y_pred = d_tree.predict(X=X_test)
Y_pred
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Test-split accuracy of the predictions currently held in Y_pred
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
accuracy
0.8864717878993882
from sklearn.metrics import classification_report,accuracy_score,f1_score,recall_score
# Per-class precision / recall / F1 for the predictions currently in Y_pred.
CR = classification_report(Y_test,Y_pred)
# Print the report: echoing the bare string shows its repr with escaped "\n"
# instead of a readable table.
print(CR)
' precision recall f1-score support\n\n 0 0.94 0.94 0.94 9144\n 1 0.49 0.50 0.50 1153\n\n accuracy 0.89 10297\n macro avg 0.72 0.72 0.72 10297\nweighted avg 0.89 0.89 0.89 10297\n'
# F1 score of the predictions currently held in Y_pred
F1_score = f1_score(y_true=Y_test, y_pred=Y_pred)
F1_score
0.4972043010752688
pwd
'C:\\Users\\amanp\\PORTUGUESE_BANK_PROJECT_1'
import plotly.express as px
# For HTML export: make 'notebook' the default plotly renderer.
import plotly.io as pio
pio.renderers.default='notebook'
# For PDF export: install pyppeteer and run its installer, which downloads a
# Chromium build. These are IPython shell escapes and only run inside a
# notebook/IPython session.
!pip install Pyppeteer
!pyppeteer-install
Collecting Pyppeteer
Downloading pyppeteer-1.0.2-py3-none-any.whl (83 kB)
---------------------------------------- 83.4/83.4 kB 4.9 MB/s eta 0:00:00
Requirement already satisfied: importlib-metadata>=1.4 in c:\users\amanp\anaconda3\lib\site-packages (from Pyppeteer) (4.11.3)
Requirement already satisfied: urllib3<2.0.0,>=1.25.8 in c:\users\amanp\anaconda3\lib\site-packages (from Pyppeteer) (1.26.14)
Collecting websockets<11.0,>=10.0
Downloading websockets-10.4-cp310-cp310-win_amd64.whl (101 kB)
---------------------------------------- 101.4/101.4 kB ? eta 0:00:00
Requirement already satisfied: tqdm<5.0.0,>=4.42.1 in c:\users\amanp\anaconda3\lib\site-packages (from Pyppeteer) (4.64.1)
Collecting pyee<9.0.0,>=8.1.0
Downloading pyee-8.2.2-py2.py3-none-any.whl (12 kB)
Requirement already satisfied: certifi>=2021 in c:\users\amanp\anaconda3\lib\site-packages (from Pyppeteer) (2022.12.7)
Requirement already satisfied: appdirs<2.0.0,>=1.4.3 in c:\users\amanp\anaconda3\lib\site-packages (from Pyppeteer) (1.4.4)
Requirement already satisfied: zipp>=0.5 in c:\users\amanp\anaconda3\lib\site-packages (from importlib-metadata>=1.4->Pyppeteer) (3.11.0)
Requirement already satisfied: colorama in c:\users\amanp\anaconda3\lib\site-packages (from tqdm<5.0.0,>=4.42.1->Pyppeteer) (0.4.6)
Installing collected packages: pyee, websockets, Pyppeteer
Successfully installed Pyppeteer-1.0.2 pyee-8.2.2 websockets-10.4
[INFO] Starting Chromium download. 0%| | 0.00/137M [00:00<?, ?b/s] 0%| | 41.0k/137M [00:00<08:21, 273kb/s] 0%| | 71.7k/137M [00:00<09:55, 230kb/s] 0%| | 102k/137M [00:00<12:08, 188kb/s] 0%| | 174k/137M [00:00<06:54, 330kb/s] 0%| | 236k/137M [00:00<05:46, 394kb/s] 0%| | 328k/137M [00:00<04:17, 531kb/s] 0%| | 512k/137M [00:00<02:35, 875kb/s] 1%| | 717k/137M [00:01<01:53, 1.20Mb/s] 1%| | 1.05M/137M [00:01<01:15, 1.81Mb/s] 1%|1 | 1.53M/137M [00:01<00:51, 2.64Mb/s] 2%|1 | 2.21M/137M [00:01<00:34, 3.86Mb/s] 2%|2 | 3.24M/137M [00:01<00:23, 5.72Mb/s] 3%|3 | 4.34M/137M [00:01<00:18, 7.29Mb/s] 4%|3 | 5.09M/137M [00:01<00:18, 7.13Mb/s] 4%|4 | 6.06M/137M [00:01<00:16, 7.87Mb/s] 5%|5 | 7.03M/137M [00:01<00:15, 8.28Mb/s] 6%|5 | 8.08M/137M [00:01<00:14, 8.87Mb/s] 7%|6 | 9.03M/137M [00:02<00:14, 9.05Mb/s] 7%|7 | 9.98M/137M [00:02<00:13, 9.16Mb/s] 8%|8 | 11.0M/137M [00:02<00:13, 9.48Mb/s] 9%|8 | 12.0M/137M [00:02<00:12, 9.64Mb/s] 10%|9 | 13.1M/137M [00:02<00:12, 9.76Mb/s] 10%|# | 14.3M/137M [00:02<00:12, 10.2Mb/s] 11%|#1 | 15.5M/137M [00:02<00:11, 10.7Mb/s] 12%|#2 | 16.6M/137M [00:02<00:11, 10.7Mb/s] 13%|#2 | 17.7M/137M [00:02<00:11, 10.8Mb/s] 14%|#3 | 18.7M/137M [00:02<00:10, 10.8Mb/s] 14%|#4 | 19.8M/137M [00:03<00:10, 10.8Mb/s] 15%|#5 | 20.9M/137M [00:03<00:10, 10.8Mb/s] 16%|#6 | 22.0M/137M [00:03<00:10, 10.9Mb/s] 17%|#6 | 23.1M/137M [00:03<00:10, 10.6Mb/s] 18%|#7 | 24.3M/137M [00:03<00:10, 10.9Mb/s] 19%|#8 | 25.4M/137M [00:03<00:10, 10.5Mb/s] 19%|#9 | 26.7M/137M [00:03<00:09, 11.1Mb/s] 20%|## | 27.8M/137M [00:03<00:09, 11.0Mb/s] 21%|##1 | 28.9M/137M [00:03<00:09, 11.0Mb/s] 22%|##1 | 30.0M/137M [00:04<00:09, 10.9Mb/s] 23%|##2 | 31.1M/137M [00:04<00:09, 10.9Mb/s] 24%|##3 | 32.2M/137M [00:04<00:09, 10.9Mb/s] 24%|##4 | 33.3M/137M [00:04<00:09, 10.9Mb/s] 25%|##5 | 34.5M/137M [00:04<00:09, 10.9Mb/s] 26%|##5 | 35.6M/137M [00:04<00:09, 10.8Mb/s] 27%|##6 | 36.6M/137M [00:04<00:09, 10.7Mb/s] 28%|##7 | 37.8M/137M [00:04<00:09, 10.9Mb/s] 28%|##8 | 38.9M/137M [00:04<00:08, 10.9Mb/s] 29%|##9 
| 40.0M/137M [00:04<00:08, 10.9Mb/s] 30%|### | 41.1M/137M [00:05<00:08, 10.9Mb/s] 31%|### | 42.2M/137M [00:05<00:08, 10.9Mb/s] 32%|###1 | 43.3M/137M [00:05<00:08, 10.9Mb/s] 32%|###2 | 44.4M/137M [00:05<00:08, 10.9Mb/s] 33%|###3 | 45.5M/137M [00:05<00:08, 10.8Mb/s] 34%|###4 | 46.6M/137M [00:05<00:08, 10.9Mb/s] 35%|###4 | 47.7M/137M [00:05<00:08, 10.9Mb/s] 36%|###5 | 48.8M/137M [00:05<00:08, 10.8Mb/s] 36%|###6 | 49.9M/137M [00:05<00:08, 9.85Mb/s] 37%|###7 | 50.9M/137M [00:06<00:10, 8.25Mb/s] 38%|###8 | 52.1M/137M [00:06<00:09, 9.09Mb/s] 39%|###8 | 53.2M/137M [00:06<00:08, 9.57Mb/s] 40%|###9 | 54.3M/137M [00:06<00:08, 9.87Mb/s] 40%|#### | 55.4M/137M [00:06<00:07, 10.2Mb/s] 41%|####1 | 56.5M/137M [00:06<00:07, 10.4Mb/s] 42%|####2 | 57.6M/137M [00:06<00:07, 10.5Mb/s] 43%|####2 | 58.7M/137M [00:06<00:07, 10.6Mb/s] 44%|####3 | 59.8M/137M [00:06<00:07, 10.7Mb/s] 44%|####4 | 60.9M/137M [00:06<00:07, 10.7Mb/s] 45%|####5 | 62.0M/137M [00:07<00:06, 10.8Mb/s] 46%|####6 | 63.1M/137M [00:07<00:06, 10.8Mb/s] 47%|####6 | 64.2M/137M [00:07<00:06, 10.8Mb/s] 48%|####7 | 65.2M/137M [00:07<00:06, 10.8Mb/s] 48%|####8 | 66.4M/137M [00:07<00:06, 10.9Mb/s] 49%|####9 | 67.5M/137M [00:07<00:06, 10.9Mb/s] 50%|##### | 68.6M/137M [00:07<00:06, 10.9Mb/s] 51%|##### | 69.7M/137M [00:07<00:06, 10.9Mb/s] 52%|#####1 | 70.8M/137M [00:07<00:06, 10.6Mb/s] 53%|#####2 | 72.0M/137M [00:07<00:05, 11.0Mb/s] 53%|#####3 | 73.1M/137M [00:08<00:05, 11.0Mb/s] 54%|#####4 | 74.2M/137M [00:08<00:05, 10.9Mb/s] 55%|#####4 | 75.3M/137M [00:08<00:05, 10.8Mb/s] 56%|#####5 | 76.4M/137M [00:08<00:05, 10.9Mb/s] 57%|#####6 | 77.5M/137M [00:08<00:05, 10.9Mb/s] 57%|#####7 | 78.6M/137M [00:08<00:05, 10.9Mb/s] 58%|#####8 | 79.7M/137M [00:08<00:05, 10.6Mb/s] 59%|#####9 | 80.9M/137M [00:08<00:05, 11.0Mb/s] 60%|#####9 | 82.0M/137M [00:08<00:05, 11.0Mb/s] 61%|###### | 83.1M/137M [00:09<00:04, 10.9Mb/s] 62%|######1 | 84.2M/137M [00:09<00:04, 10.9Mb/s] 62%|######2 | 85.3M/137M [00:09<00:04, 10.9Mb/s] 63%|######3 | 86.4M/137M 
[00:09<00:04, 10.9Mb/s] 64%|######3 | 87.5M/137M [00:09<00:04, 10.9Mb/s] 65%|######4 | 88.6M/137M [00:09<00:04, 10.9Mb/s] 66%|######5 | 89.7M/137M [00:09<00:04, 10.9Mb/s] 66%|######6 | 90.8M/137M [00:09<00:04, 10.9Mb/s] 67%|######7 | 91.9M/137M [00:09<00:04, 10.9Mb/s] 68%|######7 | 93.0M/137M [00:09<00:04, 10.9Mb/s] 69%|######8 | 94.1M/137M [00:10<00:03, 10.9Mb/s] 70%|######9 | 95.2M/137M [00:10<00:03, 10.9Mb/s] 70%|####### | 96.3M/137M [00:10<00:03, 10.9Mb/s] 71%|#######1 | 97.4M/137M [00:10<00:03, 10.9Mb/s] 72%|#######1 | 98.5M/137M [00:10<00:03, 10.9Mb/s] 73%|#######2 | 99.6M/137M [00:10<00:03, 10.9Mb/s] 74%|#######3 | 101M/137M [00:10<00:03, 10.5Mb/s] 74%|#######4 | 102M/137M [00:10<00:03, 11.0Mb/s] 75%|#######5 | 103M/137M [00:10<00:03, 11.0Mb/s] 76%|#######6 | 104M/137M [00:10<00:02, 11.0Mb/s] 77%|#######6 | 105M/137M [00:11<00:02, 11.0Mb/s] 78%|#######7 | 106M/137M [00:11<00:03, 8.59Mb/s] 78%|#######8 | 107M/137M [00:11<00:03, 9.05Mb/s] 79%|#######9 | 108M/137M [00:11<00:02, 9.53Mb/s] 80%|######## | 110M/137M [00:11<00:02, 9.84Mb/s] 81%|######## | 111M/137M [00:11<00:02, 10.2Mb/s] 82%|########1 | 112M/137M [00:11<00:02, 10.4Mb/s] 82%|########2 | 113M/137M [00:11<00:02, 9.15Mb/s] 84%|########3 | 115M/137M [00:11<00:02, 11.1Mb/s] 85%|########4 | 116M/137M [00:12<00:01, 10.9Mb/s] 85%|########5 | 117M/137M [00:12<00:01, 10.9Mb/s] 86%|########6 | 118M/137M [00:12<00:01, 10.9Mb/s] 87%|########6 | 119M/137M [00:12<00:01, 10.9Mb/s] 88%|########7 | 120M/137M [00:12<00:01, 10.9Mb/s] 89%|########8 | 121M/137M [00:12<00:01, 10.9Mb/s] 89%|########9 | 122M/137M [00:12<00:01, 10.9Mb/s] 90%|######### | 124M/137M [00:12<00:01, 10.9Mb/s] 91%|#########1| 125M/137M [00:12<00:01, 10.9Mb/s] 92%|#########1| 126M/137M [00:13<00:01, 10.9Mb/s] 93%|#########2| 127M/137M [00:13<00:01, 8.68Mb/s] 94%|#########4| 129M/137M [00:13<00:00, 11.6Mb/s] 95%|#########5| 130M/137M [00:13<00:00, 11.4Mb/s] 96%|#########5| 131M/137M [00:13<00:00, 11.3Mb/s] 97%|#########6| 133M/137M [00:13<00:00, 
11.2Mb/s] 98%|#########7| 134M/137M [00:13<00:00, 11.1Mb/s] 98%|#########8| 135M/137M [00:13<00:00, 11.0Mb/s] 99%|#########9| 136M/137M [00:13<00:00, 11.0Mb/s] 100%|##########| 137M/137M [00:14<00:00, 9.75Mb/s] [INFO] Beginning extraction [INFO] Chromium extracted to: C:\Users\amanp\AppData\Local\pyppeteer\pyppeteer\local-chromium\588429